Objective:
GSE151158,GSE58979,GSE63067,GSE89632 are merged and split into training and testing data for the random forest.With these dataset the parameters for the random forest were tuned.GSM836223 is used for validating the model.
library(randomForest)
library(caret)
library(e1071)
library(alookr)
library(dplyr)
library(pROC)
library(ggplot2)
library(ggpubr)
library(ggsignif)
library(enrichR)
library(DT)
library(corrplot)
library(dplyr)
library(tibble)
load the data
data<-read.csv("dataset_model.csv",header = T,row.names = 1)
validation<-read.csv("validation.csv",header = T,row.names = 1)
data_scaled<-scale(data[,2:ncol(data)])
data_scaled<-as.data.frame(data_scaled)
data_scaled<-add_column(data_scaled, sample=data$sample, .before = 1)
data<-data_scaled
#write.csv(data,"significant_genes.csv")
head(data)
## sample BTK PTPN6 SERPING1 BAX ITGAE
## GSM2385720 steatosis 1.08127960 0.1322743 -1.2408517 -0.9896539 1.68591912
## GSM2385722 steatosis 0.38673710 -1.7828508 0.1708034 -0.0515581 1.89125923
## GSM2385723 steatosis -2.03642411 -1.8695068 0.8112607 -1.2316064 0.07979388
## GSM2385724 steatosis 0.36924723 -0.6968099 -0.9745863 -1.1414650 1.47989525
## GSM2385725 steatosis 0.01993391 -0.1333171 -1.1731216 -1.1803881 1.04056815
## GSM2385726 steatosis 3.08558805 0.5342087 -2.8857307 -1.1289022 2.20415261
## IL1RAP MSR1 TNFRSF14 IL15 B2M TLR1
## GSM2385720 0.327926095 0.4027924 0.3160169 1.9404703 -0.3462406 2.8056676
## GSM2385722 0.145337497 1.2519478 0.4940662 1.2815957 -0.1870264 1.2717048
## GSM2385723 -1.460963433 -0.2745521 0.9335067 -0.3406963 -0.2991018 -1.8724132
## GSM2385724 -0.352411648 0.6069674 1.2074779 1.6431957 -0.2961883 1.7921221
## GSM2385725 -0.504099484 0.5650986 1.2900667 0.7437252 -0.3307479 0.6245865
## GSM2385726 -0.002064212 0.6567890 0.6967889 3.2933405 0.8288701 3.6738902
## HPRT1 CX3CR1 TOLLIP C9 IFIH1 GAPDH
## GSM2385720 0.1060220 1.2821324 -1.1293317 0.7533798 0.9998472 0.68726695
## GSM2385722 0.4387493 0.6580390 0.2812988 0.2609796 1.0355334 0.53900779
## GSM2385723 -0.8278678 -0.2589630 1.4676221 -0.3615333 -0.3201400 0.46547039
## GSM2385724 -0.8597843 1.2250265 -0.1890598 0.2922215 1.2900300 0.07447177
## GSM2385725 -1.2341933 0.8924771 0.7710285 0.2947479 0.7312412 0.13997290
## GSM2385726 0.1581342 1.7473101 -2.0158677 -0.2809214 1.2425143 0.86849115
## C4BPA
## GSM2385720 0.46693864
## GSM2385722 0.22515669
## GSM2385723 -0.20885698
## GSM2385724 -0.12497344
## GSM2385725 0.02156057
## GSM2385726 -0.25109679
summary(data)
## sample BTK PTPN6 SERPING1
## Length:143 Min. :-3.76807 Min. :-2.40889 Min. :-2.8857
## Class :character 1st Qu.:-0.63367 1st Qu.:-0.68114 1st Qu.:-0.6324
## Mode :character Median : 0.01993 Median : 0.09588 Median : 0.1449
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.61370 3rd Qu.: 0.74178 3rd Qu.: 0.6071
## Max. : 3.08559 Max. : 2.85415 Max. : 2.4024
## BAX ITGAE IL1RAP MSR1
## Min. :-2.5414 Min. :-3.21674 Min. :-3.428031 Min. :-3.406505
## 1st Qu.:-0.6528 1st Qu.:-0.64122 1st Qu.:-0.506883 1st Qu.:-0.635337
## Median :-0.0618 Median :-0.04877 Median :-0.002064 Median : 0.003534
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000 Mean : 0.000000
## 3rd Qu.: 0.8257 3rd Qu.: 0.64536 3rd Qu.: 0.472827 3rd Qu.: 0.639954
## Max. : 2.2700 Max. : 2.96227 Max. : 2.716055 Max. : 2.452036
## TNFRSF14 IL15 B2M TLR1
## Min. :-3.89862 Min. :-3.1023 Min. :-2.5241 Min. :-1.94243
## 1st Qu.:-0.54319 1st Qu.:-0.5648 1st Qu.:-0.5197 1st Qu.:-0.62100
## Median : 0.06174 Median :-0.1349 Median :-0.1119 Median : 0.02295
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.64320 3rd Qu.: 0.5188 3rd Qu.: 0.4584 3rd Qu.: 0.47069
## Max. : 2.21249 Max. : 3.2933 Max. : 3.3227 Max. : 3.67389
## HPRT1 CX3CR1 TOLLIP C9
## Min. :-5.26452 Min. :-3.43118 Min. :-3.57206 Min. :-3.54306
## 1st Qu.:-0.60502 1st Qu.:-0.49653 1st Qu.:-0.53748 1st Qu.:-0.62527
## Median : 0.05058 Median : 0.08593 Median : 0.08156 Median : 0.03723
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.49474 3rd Qu.: 0.67503 3rd Qu.: 0.46771 3rd Qu.: 0.56708
## Max. : 2.44572 Max. : 2.05143 Max. : 3.18755 Max. : 4.98057
## IFIH1 GAPDH C4BPA
## Min. :-6.36339 Min. :-2.65233 Min. :-1.7778
## 1st Qu.:-0.38866 1st Qu.:-0.60569 1st Qu.:-0.5082
## Median :-0.01679 Median : 0.05987 Median : 0.1222
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.47957 3rd Qu.: 0.53692 3rd Qu.: 0.4495
## Max. : 4.35314 Max. : 5.06687 Max. : 4.9840
validation_scaled<-scale(validation[,2:ncol(validation)])
validation_scaled<-as.data.frame(validation_scaled)
validation_scaled<-add_column(validation_scaled, sample=validation$sample, .before = 1)
validation<-validation_scaled
head(validation)
## sample GAPDH BTK PTPN6 SERPING1 BAX
## GSM836223 steatosis 0.02387721 -1.33068224 -0.6623408 0.5684840 1.05387898
## GSM836224 steatosis 2.24668729 0.02214979 0.4669600 1.1815302 -0.86003240
## GSM836225 steatosis 0.12327816 -0.54412761 -0.4170409 -0.6543308 -0.27697749
## GSM836227 steatosis 0.10458447 -0.69820634 -0.2269735 -0.6983119 -0.68515513
## GSM836237 steatosis 0.71130467 -1.18222884 0.3959176 -0.7417644 -0.09105392
## GSM836238 steatosis 0.51901991 -1.47829753 -0.1557869 -0.5699974 -1.96562297
## ITGAE IL1RAP MSR1 TNFRSF14 IL15 B2M
## GSM836223 -1.9535168 0.8583664 -0.7309989 -1.7229685 -2.4391517 -1.3206688
## GSM836224 -0.9180950 -1.5220980 -0.9279548 1.8403154 -0.8057021 -0.7162281
## GSM836225 -0.9199808 -1.1769914 -1.0773319 0.3327671 -0.3366715 -1.1009666
## GSM836227 -0.4511072 -1.1910689 -0.6349280 1.2523891 0.3247614 -0.2748755
## GSM836237 -0.9658711 -0.6357247 -1.1750510 0.6396480 1.0945745 -0.6372185
## GSM836238 -0.8189395 -0.3916954 -0.9834789 0.8400943 0.8760299 -0.3910408
## TLR1 HPRT1 CX3CR1 TOLLIP C9 IFIH1
## GSM836223 -1.5661492 -0.1520123 -1.25637577 -0.2204031 -0.7322845 -1.7301328
## GSM836224 -1.4572980 -1.9667093 0.07733541 1.9103593 0.2410451 -1.1699885
## GSM836225 -1.0289664 0.5494917 -0.95228147 0.0251141 -0.2698430 -0.2242031
## GSM836227 -1.0332409 -0.4032159 -0.24575007 1.0317068 -1.0914814 -0.5475796
## GSM836237 -1.2092413 -0.4771169 -0.84695676 1.2561434 -0.7836355 -0.5628154
## GSM836238 -0.9128578 -0.4025952 -2.29127661 1.4100057 -0.3789646 -1.3749545
## C4BPA
## GSM836223 -0.6285687
## GSM836224 0.2540964
## GSM836225 -0.8100016
## GSM836227 -2.0302002
## GSM836237 -0.7502680
## GSM836238 -0.1887388
summary(validation)
## sample GAPDH BTK PTPN6
## Length:32 Min. :-1.2385 Min. :-1.8163 Min. :-1.4473
## Class :character 1st Qu.:-0.6825 1st Qu.:-0.8423 1st Qu.:-0.7291
## Mode :character Median :-0.1798 Median : 0.2320 Median :-0.1914
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3368 3rd Qu.: 0.6098 3rd Qu.: 0.8136
## Max. : 3.1212 Max. : 2.4661 Max. : 1.8934
## SERPING1 BAX ITGAE IL1RAP
## Min. :-1.4424 Min. :-1.9656 Min. :-1.9535 Min. :-1.73477
## 1st Qu.:-0.7310 1st Qu.:-0.6289 1st Qu.:-0.8420 1st Qu.:-0.67820
## Median :-0.3446 Median :-0.1944 Median :-0.1155 Median : 0.06399
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.5226 3rd Qu.: 0.4022 3rd Qu.: 0.7614 3rd Qu.: 0.84223
## Max. : 2.1967 Max. : 3.1260 Max. : 2.3499 Max. : 2.26158
## MSR1 TNFRSF14 IL15 B2M
## Min. :-2.0704 Min. :-1.7306 Min. :-2.4392 Min. :-2.6052
## 1st Qu.:-0.7531 1st Qu.:-0.7266 1st Qu.:-0.7307 1st Qu.:-0.7810
## Median :-0.1188 Median :-0.2578 Median : 0.3370 Median : 0.1506
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5935 3rd Qu.: 0.8701 3rd Qu.: 0.6594 3rd Qu.: 0.6504
## Max. : 2.0162 Max. : 1.8403 Max. : 1.6772 Max. : 2.1734
## TLR1 HPRT1 CX3CR1 TOLLIP
## Min. :-1.5661 Min. :-1.96671 Min. :-2.29128 Min. :-1.943810
## 1st Qu.:-0.8440 1st Qu.:-0.44580 1st Qu.:-0.76524 1st Qu.:-0.726506
## Median :-0.1844 Median :-0.05742 Median : 0.01552 Median :-0.001058
## Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.8096 3rd Qu.: 0.60101 3rd Qu.: 0.67865 3rd Qu.: 0.587332
## Max. : 2.1114 Max. : 2.19022 Max. : 2.02822 Max. : 1.947853
## C9 IFIH1 C4BPA
## Min. :-1.6159 Min. :-1.73013 Min. :-2.0302
## 1st Qu.:-0.7757 1st Qu.:-0.85155 1st Qu.:-0.6590
## Median :-0.2511 Median : 0.01446 Median : 0.1429
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.4430 3rd Qu.: 0.78302 3rd Qu.: 0.5611
## Max. : 2.9267 Max. : 1.83986 Max. : 1.9394
lets split the dataset
sb <- data %>%
split_by(sample, seed = 6534)
attr_names <- names(attributes(sb))
sb_attr <- attributes(sb)
observe how the target category is separated
sb %>%
compare_target_category()
## NULL
plot the sample
sb %>%
compare_plot("sample")
compare the variables in the train and test set
sb %>%
compare_target_numeric()
## # A tibble: 18 x 7
## variable train_mean test_mean train_sd test_sd train_z test_z
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B2M -0.00973 0.0226 0.966 1.09 -0.0101 0.0209
## 2 BAX 0.108 -0.251 0.935 1.11 0.116 -0.227
## 3 BTK 0.0617 -0.144 0.961 1.08 0.0642 -0.133
## 4 C4BPA -0.0566 0.132 0.729 1.45 -0.0776 0.0908
## 5 C9 -0.0315 0.0734 0.941 1.13 -0.0335 0.0647
## 6 CX3CR1 -0.00147 0.00343 0.991 1.03 -0.00149 0.00332
## 7 GAPDH 0.0469 -0.109 1.00 1.00 0.0469 -0.109
## 8 HPRT1 -0.0198 0.0459 0.946 1.13 -0.0209 0.0408
## 9 IFIH1 0.0157 -0.0364 1.05 0.889 0.0149 -0.0409
## 10 IL15 0.0214 -0.0498 1.00 1.00 0.0213 -0.0497
## 11 IL1RAP -0.0370 0.0860 0.965 1.09 -0.0383 0.0792
## 12 ITGAE 0.0287 -0.0667 0.983 1.05 0.0292 -0.0637
## 13 MSR1 0.0208 -0.0484 0.996 1.02 0.0209 -0.0475
## 14 PTPN6 0.108 -0.251 0.969 1.04 0.112 -0.242
## 15 SERPING1 0.0187 -0.0434 0.991 1.03 0.0189 -0.0421
## 16 TLR1 0.0545 -0.127 1.04 0.894 0.0523 -0.142
## 17 TNFRSF14 -0.0377 0.0877 0.998 1.01 -0.0378 0.0869
## 18 TOLLIP -0.0326 0.0759 1.04 0.898 -0.0313 0.0845
Extract the train and test dataset
train <- sb %>%
extract_set(set = "train")
test <- sb %>%
extract_set(set = "test")
set up the control .
In this case the repeated 10 cross validation approach is applied which gets repeated for 5 times
trControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
search = "grid",
summaryFunction=twoClassSummary,
classProbs=TRUE,
savePredictions = T)
build the model with default values initially and observe the results
set.seed(1234)
#Run the model
rf_default <- train(sample~.,
data = train,
method = "rf",
metric = "ROC",
trControl = trControl)
# Print the results
print(rf_default)
## Random Forest
##
## 100 samples
## 18 predictor
## 2 classes: 'control', 'steatosis'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 89, 90, 90, 90, 90, 90, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8511706 0.5850000 0.9171429
## 10 0.8673810 0.6200000 0.8976190
## 18 0.8609921 0.6383333 0.8723810
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
Now lets extract the best mtry
set.seed(1234)
tuneGrid <- expand.grid(.mtry = c(1: 15))
rf_mtry <- train(sample~.,
data = train,
method = "rf",
metric = "ROC",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 50)
print(rf_mtry)
## Random Forest
##
## 100 samples
## 18 predictor
## 2 classes: 'control', 'steatosis'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 89, 90, 90, 90, 90, 90, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 1 0.8454167 0.5550000 0.9052381
## 2 0.8245238 0.6100000 0.8942857
## 3 0.8417857 0.6250000 0.8923810
## 4 0.8472619 0.5933333 0.8880952
## 5 0.8494246 0.6050000 0.8957143
## 6 0.8428770 0.5950000 0.8728571
## 7 0.8441865 0.6050000 0.8795238
## 8 0.8415476 0.6233333 0.8938095
## 9 0.8426786 0.5950000 0.8766667
## 10 0.8497619 0.6200000 0.8885714
## 11 0.8449802 0.6383333 0.8733333
## 12 0.8532540 0.6266667 0.8604762
## 13 0.8500595 0.6500000 0.8733333
## 14 0.8388889 0.6150000 0.8733333
## 15 0.8384722 0.6366667 0.8542857
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 12.
plot(rf_mtry)
#store the mtry to compare it with the other variable which we are going to
best_mtry <- rf_mtry$bestTune$mtry
print(best_mtry)
## [1] 12
next lets train the maxnodes
set.seed(1234)
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(5:15)) {
set.seed(1234)
rf_maxnode <- train(sample~.,
data = train,
method = "rf",
metric = "ROC",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 50)
current_iteration <- toString(maxnodes)
store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.6875000 0.7857143 0.8660714 0.8665476 0.9627976 1 0
## 6 0.6458333 0.7544643 0.8750000 0.8570238 0.9568452 1 0
## 7 0.6666667 0.7500000 0.8333333 0.8514683 0.9635417 1 0
## 8 0.6190476 0.7703373 0.8437500 0.8516865 0.9427083 1 0
## 9 0.6250000 0.7544643 0.8333333 0.8516865 0.9739583 1 0
## 10 0.6250000 0.7500000 0.8333333 0.8402579 0.9613095 1 0
## 11 0.6250000 0.7500000 0.8333333 0.8453175 0.9754464 1 0
## 12 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464 1 0
## 13 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464 1 0
## 14 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464 1 0
## 15 0.6250000 0.7500000 0.8333333 0.8462698 0.9754464 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.00 0.5 0.6666667 0.6416667 0.7500 1 0
## 6 0.00 0.5 0.6666667 0.6400000 0.7500 1 0
## 7 0.25 0.5 0.6666667 0.6500000 0.7500 1 0
## 8 0.00 0.5 0.6666667 0.6500000 0.7500 1 0
## 9 0.25 0.5 0.6666667 0.6500000 0.7500 1 0
## 10 0.25 0.5 0.6666667 0.6316667 0.7500 1 0
## 11 0.25 0.5 0.6666667 0.6500000 0.9375 1 0
## 12 0.00 0.5 0.6666667 0.6366667 0.7500 1 0
## 13 0.00 0.5 0.6666667 0.6366667 0.7500 1 0
## 14 0.00 0.5 0.6666667 0.6366667 0.7500 1 0
## 15 0.00 0.5 0.6666667 0.6366667 0.7500 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.5000000 0.8333333 0.8571429 0.8790476 1 1 0
## 6 0.5000000 0.8333333 0.8571429 0.8761905 1 1 0
## 7 0.5000000 0.8333333 0.8571429 0.8823810 1 1 0
## 8 0.5000000 0.8333333 0.9285714 0.8819048 1 1 0
## 9 0.5000000 0.8333333 0.8571429 0.8795238 1 1 0
## 10 0.3333333 0.8333333 0.8571429 0.8704762 1 1 0
## 11 0.3333333 0.8333333 0.8571429 0.8766667 1 1 0
## 12 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 13 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 14 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 15 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
lets try increasing the node and see if there is any change.
set.seed(1234)
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in c(20: 30)) {
set.seed(1234)
rf_maxnode <- train(sample~.,
data = train,
method = "rf",
metric = "ROC",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = maxnodes,
ntree = 50)
key <- toString(maxnodes)
store_maxnode[[key]] <- rf_maxnode
}
results_node <- resamples(store_maxnode)
summary(results_node)
##
## Call:
## summary.resamples(object = results_node)
##
## Models: 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 21 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 22 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 23 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 24 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 25 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 26 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 27 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 28 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 29 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
## 30 0.625 0.75 0.8333333 0.8462698 0.9754464 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20 0 0.5 0.6666667 0.6366667 0.75 1 0
## 21 0 0.5 0.6666667 0.6366667 0.75 1 0
## 22 0 0.5 0.6666667 0.6366667 0.75 1 0
## 23 0 0.5 0.6666667 0.6366667 0.75 1 0
## 24 0 0.5 0.6666667 0.6366667 0.75 1 0
## 25 0 0.5 0.6666667 0.6366667 0.75 1 0
## 26 0 0.5 0.6666667 0.6366667 0.75 1 0
## 27 0 0.5 0.6666667 0.6366667 0.75 1 0
## 28 0 0.5 0.6666667 0.6366667 0.75 1 0
## 29 0 0.5 0.6666667 0.6366667 0.75 1 0
## 30 0 0.5 0.6666667 0.6366667 0.75 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 20 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 21 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 22 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 23 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 24 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 25 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 26 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 27 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 28 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 29 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
## 30 0.3333333 0.8333333 0.8571429 0.8738095 1 1 0
extract the best number of trees and identify the best features for the random forest.
set.seed(1234)
store_maxtrees <- list()
for (ntree in c(10,15,20,25,30,35,40,45,50,55,100,150,200,250, 300,350,400)) {
set.seed(5678)
rf_maxtrees <- train(sample~.,
data = train,
method = "rf",
metric = "ROC",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
maxnodes = 5,
ntree = ntree)
key <- toString(ntree)
store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 150, 200, 250, 300, 350, 400
## Number of resamples: 50
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10 0.2500000 0.7135417 0.8680556 0.8093254 0.9444444 1 0
## 15 0.2500000 0.7232143 0.8680556 0.8130357 0.9375000 1 0
## 20 0.3333333 0.7313988 0.8888889 0.8320635 0.9459325 1 0
## 25 0.3888889 0.7797619 0.8750000 0.8325198 0.9444444 1 0
## 30 0.4166667 0.7797619 0.8819444 0.8406746 0.9553571 1 0
## 35 0.4166667 0.7413194 0.8750000 0.8357341 0.9459325 1 0
## 40 0.4166667 0.7725694 0.8750000 0.8381349 0.9503968 1 0
## 45 0.3888889 0.7500000 0.8750000 0.8350794 0.9548611 1 0
## 50 0.3888889 0.7410714 0.8750000 0.8358532 0.9459325 1 0
## 55 0.4166667 0.7162698 0.8750000 0.8368849 0.9627976 1 0
## 100 0.3333333 0.7658730 0.8660714 0.8400198 0.9642857 1 0
## 150 0.2777778 0.7658730 0.8750000 0.8328373 0.9627976 1 0
## 200 0.2777778 0.7777778 0.8750000 0.8367063 0.9642857 1 0
## 250 0.2777778 0.7777778 0.8750000 0.8426984 0.9910714 1 0
## 300 0.2777778 0.7857143 0.8908730 0.8448214 0.9910714 1 0
## 350 0.2777778 0.7812500 0.8819444 0.8417262 0.9910714 1 0
## 400 0.2777778 0.7797619 0.8908730 0.8436905 0.9910714 1 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10 0.25 0.5000000 0.6666667 0.6516667 0.75 1 0
## 15 0.00 0.5000000 0.6666667 0.6066667 0.75 1 0
## 20 0.00 0.3333333 0.6666667 0.6166667 0.75 1 0
## 25 0.00 0.3750000 0.6666667 0.6216667 0.75 1 0
## 30 0.00 0.5000000 0.6666667 0.6333333 0.75 1 0
## 35 0.00 0.3750000 0.6666667 0.6116667 0.75 1 0
## 40 0.00 0.5000000 0.6666667 0.6266667 0.75 1 0
## 45 0.00 0.3750000 0.6666667 0.6183333 0.75 1 0
## 50 0.00 0.5000000 0.6666667 0.6250000 0.75 1 0
## 55 0.00 0.5000000 0.6666667 0.6216667 0.75 1 0
## 100 0.00 0.5000000 0.6666667 0.6416667 0.75 1 0
## 150 0.00 0.5000000 0.6666667 0.6500000 0.75 1 0
## 200 0.00 0.5000000 0.6666667 0.6500000 0.75 1 0
## 250 0.00 0.5000000 0.6666667 0.6500000 0.75 1 0
## 300 0.00 0.5000000 0.6666667 0.6450000 0.75 1 0
## 350 0.00 0.5000000 0.6666667 0.6466667 0.75 1 0
## 400 0.00 0.5000000 0.6666667 0.6416667 0.75 1 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10 0.5000000 0.7142857 0.8333333 0.8238095 0.9642857 1 0
## 15 0.5000000 0.7142857 0.8452381 0.8447619 1.0000000 1 0
## 20 0.5000000 0.7440476 0.8571429 0.8514286 1.0000000 1 0
## 25 0.6666667 0.8333333 0.8571429 0.8828571 1.0000000 1 0
## 30 0.6666667 0.8333333 0.8571429 0.8714286 1.0000000 1 0
## 35 0.6666667 0.8333333 0.8571429 0.8676190 1.0000000 1 0
## 40 0.5000000 0.8333333 0.8571429 0.8552381 1.0000000 1 0
## 45 0.6666667 0.8333333 0.8571429 0.8685714 1.0000000 1 0
## 50 0.6666667 0.8333333 0.8571429 0.8619048 1.0000000 1 0
## 55 0.6666667 0.8333333 0.8571429 0.8776190 1.0000000 1 0
## 100 0.5000000 0.8333333 0.9285714 0.8833333 1.0000000 1 0
## 150 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000 1 0
## 200 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000 1 0
## 250 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000 1 0
## 300 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000 1 0
## 350 0.5000000 0.8333333 0.9285714 0.8923810 1.0000000 1 0
## 400 0.5000000 0.8333333 0.9285714 0.8957143 1.0000000 1 0
extract the best number of trees and identify the best features for the random forest.
set.seed(1234)
fit_rf <- train(sample~.,
data= train,
method = "rf",
metric = "ROC",
tuneGrid = tuneGrid,
trControl = trControl,
importance = TRUE,
nodesize = 14,
ntree = 55,
maxnodes = 6)
print(fit_rf$bestTune)
## mtry
## 1 12
res<-fit_rf$results
Make predictions using the test data set
probs_test <- predict(fit_rf,test,type="prob")
test$sample = as.factor(test$sample)
gbm.ROC_test<- roc(predictor=probs_test$steatosis,
response=test$sample)
## Setting levels: control = control, case = steatosis
## Setting direction: controls < cases
table(test$sample)
##
## control steatosis
## 16 27
print(gbm.ROC_test[["auc"]])
## Area under the curve: 0.919
Repeat the steps for the validation set
probs_val <- predict(fit_rf,validation,type="prob")
validation$sample=as.factor(validation$sample)
gbm.ROC_val<- roc(predictor=probs_val$steatosis,
response=validation$sample)
## Setting levels: control = normal, case = steatosis
## Setting direction: controls < cases
table(validation$sample)
##
## normal steatosis
## 13 19
print(gbm.ROC_val[["auc"]])
## Area under the curve: 0.7308
print(colnames(validation))
## [1] "sample" "GAPDH" "BTK" "PTPN6" "SERPING1" "BAX"
## [7] "ITGAE" "IL1RAP" "MSR1" "TNFRSF14" "IL15" "B2M"
## [13] "TLR1" "HPRT1" "CX3CR1" "TOLLIP" "C9" "IFIH1"
## [19] "C4BPA"
plot the ROC curve
plot(gbm.ROC_test,col ="darkred",main="Random Forest ROC curve",
col.lab="black", cex.lab=1.5)
text(x = 0.7445764,y =0.9701577,label="AUC:0.91",cex=1)
plot(gbm.ROC_val,col ="darkgreen",add=T)
text(x = 0.3274354,y = 0.7076923,label="AUC:0.73",cex=1)
legend(x = "bottomright",
c('Test-NC:16,NS:27','Validation-NC:13,NS:19','NC:Number of control','NS:Number of steatosis'),lty=c(1,1),
lwd=c(2,2),col=c('darkred','darkgreen','white','white'))
lets do the jitter plot for train and validation test.
train_data_plot<-as.data.frame(train)
train_data_plot$sample=as.factor(train_data_plot$sample)
a<-compare_means(c(C9,HPRT1,TLR1,B2M,BAX,GAPDH,BTK,PTPN6,SERPING1,ITGAE,IL1RAP,MSR1,TNFRSF14,IL15,CX3CR1,TOLLIP,IFIH1,C4BPA) ~ sample ,data = train_data_plot)
my_gene_list <- c("C9","HPRT1","TLR1","B2M","BAX","GAPDH","BTK","PTPN6","SERPING1","ITGAE","IL1RAP","MSR1","TNFRSF14","IL15","CX3CR1","TOLLIP","IFIH1","C4BPA")
my_plot_list <- vector(mode = "list", length = 18)
for (i in 1:18) {
my_comparisons <- list( c("Steatosis", "Control") )
p <-ggboxplot(train_data_plot, x = "sample", y = my_gene_list[i],
color = "sample", palette = "jco",add = "jitter")+
theme(axis.text=element_text(size=12,face="bold"),
axis.title=element_text(size=14,face="bold"))+
stat_compare_means(label.y = 16,size=5)
my_plot_list[[i]] <- p
}
print(my_plot_list[1:18])
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
validation_data_plot<-as.data.frame(validation)
validation_data_plot$sample=as.factor(validation_data_plot$sample)
compare_means(c(BTK,PTPN6,SERPING1,ITGAE,IL1RAP,MSR1,TNFRSF14,IL15,CX3CR1,TOLLIP,IFIH1,C4BPA) ~ sample ,data = validation_data_plot)
## # A tibble: 12 x 8
## .y. group1 group2 p p.adj p.format p.signif method
## <fct> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 BTK normal steatosis 0.00593 0.0059 0.00593 ** Wilcoxon
## 2 PTPN6 normal steatosis 0.650 0.65 0.64982 ns Wilcoxon
## 3 SERPING1 normal steatosis 0.120 0.12 0.12013 ns Wilcoxon
## 4 ITGAE normal steatosis 0.000143 0.00014 0.00014 *** Wilcoxon
## 5 IL1RAP normal steatosis 0.00103 0.001 0.00103 ** Wilcoxon
## 6 MSR1 normal steatosis 0.00166 0.0017 0.00166 ** Wilcoxon
## 7 TNFRSF14 normal steatosis 0.000744 0.00074 0.00074 *** Wilcoxon
## 8 IL15 normal steatosis 0.426 0.43 0.42591 ns Wilcoxon
## 9 CX3CR1 normal steatosis 0.000528 0.00053 0.00053 *** Wilcoxon
## 10 TOLLIP normal steatosis 0.0000497 0.00005 5e-05 **** Wilcoxon
## 11 IFIH1 normal steatosis 0.000744 0.00074 0.00074 *** Wilcoxon
## 12 C4BPA normal steatosis 0.00354 0.0035 0.00354 ** Wilcoxon
plot the bar graph for validation set
my_plot_list_2 <- vector(mode = "list", length = 18)
for (i in 1:18) {
my_comparisons <- list( c("Steatosis", "Control") )
p <-ggboxplot(validation_data_plot, x = "sample", y = my_gene_list[i],
color = "sample", palette = "jco",add = "jitter")+
theme(axis.text=element_text(size=12,face="bold"),
axis.title=element_text(size=14,face="bold"))+
stat_compare_means(label.y = 16,size=5)
my_plot_list_2[[i]] <- p
}
print(my_plot_list_2[1:18])
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
#stat_compare_means(label.y = 16,face="bold")
setEnrichrSite("Enrichr") # Human genes
websiteLive <- TRUE
dbs <- listEnrichrDbs()
dbs <- c("GO_Molecular_Function_2021", "GO_Cellular_Component_2021", "GO_Biological_Process_2021",
"Reactome_2021","KEGG_2021_Human")
genes_name<-colnames(train)
genes_name<-genes_name[2:nrow(train)]
if (websiteLive) {
enriched <- enrichr(genes_name, dbs)
}
## Uploading data to Enrichr... Done.
## Querying GO_Molecular_Function_2021... Done.
## Querying GO_Cellular_Component_2021... Done.
## Querying GO_Biological_Process_2021... Done.
## Querying Reactome_2021... Done.
## Querying KEGG_2021_Human... Done.
## Parsing results... Done.
if (websiteLive) datatable(enriched[["KEGG_2021_Human"]])
if (websiteLive) plotEnrich(enriched[[5]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value",title = "Enrichment analysis")
make the correlation
steatosis<-'steatosis'
steatosis_samples<-train[train$sample %in% steatosis, ]
steatosis_samples<-steatosis_samples[,-c(1)]
steatosis_samples <- cor(steatosis_samples)
corrplot(steatosis_samples,method="circle",cl.lim=c(-1,1),col=colorRampPalette(c("blue","white","red"))(200))
control<-'control'
control_samples<-train[train$sample %in% control, ]
control_samples<-control_samples[,-c(1)]
control_samples <- cor(control_samples)
corrplot(control_samples,method="circle",cl.lim=c(-1,1),col=colorRampPalette(c("blue","white","red"))(200))
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_3.1.1 corrplot_0.89 DT_0.18
## [4] enrichR_3.0 ggsignif_0.6.1 ggpubr_0.4.0
## [7] pROC_1.17.0.1 dplyr_1.0.6 alookr_0.3.4
## [10] e1071_1.7-7 caret_6.0-88 ggplot2_3.3.3
## [13] lattice_0.20-41 randomForest_4.6-14
##
## loaded via a namespace (and not attached):
## [1] mlr_2.19.0 readxl_1.3.1 backports_1.2.1
## [4] fastmatch_1.1-0 Hmisc_4.5-0 systemfonts_1.0.2
## [7] plyr_1.8.6 lazyeval_0.2.2 splines_4.0.3
## [10] crosstalk_1.1.1 digest_0.6.27 foreach_1.5.1
## [13] htmltools_0.5.1.1 fansi_0.4.2 magrittr_2.0.1
## [16] checkmate_2.0.0 BBmisc_1.11 unbalanced_2.0
## [19] doParallel_1.0.16 cluster_2.1.0 openxlsx_4.2.3
## [22] recipes_0.1.16 gower_0.2.2 extrafont_0.17
## [25] sandwich_3.0-1 extrafontdb_1.0 svglite_2.0.0
## [28] jpeg_0.1-8.1 colorspace_2.0-1 rvest_1.0.0
## [31] ggrepel_0.9.1 haven_2.4.1 xfun_0.23
## [34] crayon_1.4.1 jsonlite_1.7.2 libcoin_1.0-8
## [37] survival_3.2-7 zoo_1.8-9 iterators_1.0.13
## [40] glue_1.4.2 kableExtra_1.3.4 gtable_0.3.0
## [43] ipred_0.9-11 webshot_0.5.2 car_3.0-10
## [46] Rttf2pt1_1.3.8 abind_1.4-5 scales_1.1.1
## [49] mvtnorm_1.1-2 DBI_1.1.1 rstatix_0.7.0
## [52] Rcpp_1.0.6 viridisLite_0.4.0 htmlTable_2.2.1
## [55] foreign_0.8-80 proxy_0.4-26 Formula_1.2-4
## [58] stats4_4.0.3 lava_1.6.9 prodlim_2019.11.13
## [61] htmlwidgets_1.5.3 httr_1.4.2 FNN_1.1.3
## [64] RColorBrewer_1.1-2 ellipsis_0.3.2 farver_2.1.0
## [67] ParamHelpers_1.14 pkgconfig_2.0.3 nnet_7.3-14
## [70] sass_0.4.0 utf8_1.2.1 labeling_0.4.2
## [73] tidyselect_1.1.1 rlang_0.4.10 reshape2_1.4.4
## [76] munsell_0.5.0 cellranger_1.1.0 tools_4.0.3
## [79] xgboost_1.4.1.1 cli_2.5.0 generics_0.1.0
## [82] ranger_0.12.1 broom_0.7.6 evaluate_0.14
## [85] stringr_1.4.0 yaml_2.2.1 ModelMetrics_1.2.2.2
## [88] knitr_1.33 zip_2.1.1 ggmosaic_0.3.3
## [91] caTools_1.18.2 RANN_2.6.1 purrr_0.3.4
## [94] nlme_3.1-149 xml2_1.3.2 compiler_4.0.3
## [97] rstudioapi_0.13 plotly_4.9.4 curl_4.3.1
## [100] png_0.1-7 bslib_0.2.5.1 stringi_1.5.3
## [103] highr_0.9 forcats_0.5.1 gdtools_0.2.3
## [106] hrbrthemes_0.8.0 Matrix_1.2-18 dlookr_0.4.5
## [109] ggsci_2.9 vctrs_0.3.8 RcmdrMisc_2.7-1
## [112] pillar_1.6.1 lifecycle_1.0.0 jquerylib_0.1.4
## [115] data.table_1.14.0 bitops_1.0-7 R6_2.5.0
## [118] latticeExtra_0.6-29 gridExtra_2.3 rio_0.5.26
## [121] codetools_0.2-16 MASS_7.3-53 assertthat_0.2.1
## [124] rjson_0.2.20 withr_2.4.2 nortest_1.0-4
## [127] parallel_4.0.3 hms_1.1.0 grid_4.0.3
## [130] prettydoc_0.4.1 rpart_4.1-15 timeDate_3043.102
## [133] tidyr_1.1.3 class_7.3-17 rmarkdown_2.8
## [136] inum_1.0-4 carData_3.0-4 parallelMap_1.5.0
## [139] partykit_1.2-13 lubridate_1.7.10 base64enc_0.1-3